Reading and Preparing Data
library(tidyverse)
library(scales)
library(viridis)
library(stats)
library(corrplot)
library(car)
library(cluster)
library(plotly)
# Load TB dataset
tb_data <- read.csv("TB_Burden_Country.csv", stringsAsFactors = FALSE)
# Filter for latest available year in dataset
latest_year <- max(tb_data$Year, na.rm = TRUE)
tb_latest <- tb_data %>% filter(Year == latest_year)
# Select and clean relevant columns
tb_clean <- tb_latest %>%
select(
country = `Country.or.territory.name`,
population = `Estimated.total.population.number`,
prevalence_per_100k = `Estimated.prevalence.of.TB..all.forms..per.100.000.population`,
mortality_per_100k = `Estimated.mortality.of.TB.cases..all.forms..excluding.HIV..per.100.000.population`,
incidence_per_100k = `Estimated.incidence..all.forms..per.100.000.population`,
hiv_percent = `Estimated.HIV.in.incident.TB..percent.`,
case_detection_rate = `Case.detection.rate..all.forms...percent`
) %>%
drop_na()
# Rename long country name
tb_clean$country[tb_clean$country == "Democratic People's Republic of Korea"] <- "Korea"
Correlation Analysis
numeric_vars <- tb_clean %>%
select(prevalence_per_100k, mortality_per_100k, incidence_per_100k, hiv_percent, case_detection_rate)
correlation_matrix <- cor(numeric_vars, use = "complete.obs")
corrplot(correlation_matrix, method = "color",
type = "upper", order = "hclust",
addCoef.col = "black", number.cex = 0.7)

Regression Analysis
mortality_model <- lm(mortality_per_100k ~ incidence_per_100k, data = tb_clean)
model_summary <- summary(mortality_model)
conf_intervals <- confint(mortality_model)
regression_plot <- ggplot(tb_clean, aes(x = incidence_per_100k, y = mortality_per_100k)) +
geom_point() +
geom_smooth(method = "lm", se = TRUE, color = "darkred") +
theme_minimal() +
labs(title = "TB Mortality vs. Incidence with Confidence Interval",
subtitle = paste("R² =", round(model_summary$r.squared, 3)),
x = "Incidence per 100k", y = "Mortality per 100k")
regression_plot

model_summary
##
## Call:
## lm(formula = mortality_per_100k ~ incidence_per_100k, data = tb_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.892 -4.555 -3.387 1.407 60.303
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.175168 1.243672 2.553 0.0117 *
## incidence_per_100k 0.090302 0.005303 17.029 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.45 on 151 degrees of freedom
## Multiple R-squared: 0.6576, Adjusted R-squared: 0.6553
## F-statistic: 290 on 1 and 151 DF, p-value: < 2.2e-16
conf_intervals
## 2.5 % 97.5 %
## (Intercept) 0.71792201 5.6324141
## incidence_per_100k 0.07982503 0.1007794
K-Means Clustering
scaled_data <- scale(numeric_vars)
kmeans_result <- kmeans(scaled_data, centers = 3, nstart = 25)
tb_clustered <- tb_clean %>%
mutate(cluster = as.factor(kmeans_result$cluster))
cluster_stats <- tb_clustered %>%
group_by(cluster) %>%
summarize(
count = n(),
mean_incidence = mean(incidence_per_100k),
mean_mortality = mean(mortality_per_100k),
mean_hiv_percent = mean(hiv_percent),
mean_detection = mean(case_detection_rate)
)
cluster_stats
## # A tibble: 3 × 6
## cluster count mean_incidence mean_mortality mean_hiv_percent mean_detection
## <fct> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 110 54.2 4.68 8.67 79.7
## 2 2 32 250. 39.2 11.1 58.7
## 3 3 11 646. 56.2 55.3 53.8
Top 10 TB Incidence Rates
tb_top_10 <- tb_clean %>% arrange(desc(incidence_per_100k)) %>% slice(1:10)
ggplot(tb_top_10, aes(x = reorder(country, -incidence_per_100k), y = incidence_per_100k)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
labs(title = "Top 10 TB Incidence Rate by Country", x = "Country", y = "Incidence Rate per 100,000")

Interactive Cluster Plot
plotly_cluster <- ggplot(tb_clustered, aes(x = incidence_per_100k, y = mortality_per_100k, color = cluster, text = country)) +
geom_point(size = 3) +
theme_minimal()
ggplotly(plotly_cluster, tooltip = "text")